{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train a Deep NN to predict Asset Price movements"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports & Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:01.348839Z",
     "start_time": "2020-06-21T23:10:01.345834Z"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:01.573078Z",
     "start_time": "2020-06-21T23:10:01.351361Z"
    }
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:01.576526Z",
     "start_time": "2020-06-21T23:10:01.574420Z"
    }
   },
   "outputs": [],
   "source": [
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:01.586396Z",
     "start_time": "2020-06-21T23:10:01.577729Z"
    }
   },
   "outputs": [],
   "source": [
    "idx = pd.IndexSlice"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build daily dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:01.594754Z",
     "start_time": "2020-06-21T23:10:01.587559Z"
    }
   },
   "outputs": [],
   "source": [
    "DATA_DIR = Path('..', 'data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:04.005540Z",
     "start_time": "2020-06-21T23:10:01.595821Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 5698754 entries, (Timestamp('2010-01-04 00:00:00'), 'A') to (Timestamp('2017-12-29 00:00:00'), 'ZUMZ')\n",
      "Data columns (total 2 columns):\n",
      " #   Column      Dtype  \n",
      "---  ------      -----  \n",
      " 0   adj_close   float64\n",
      " 1   adj_volume  float64\n",
      "dtypes: float64(2)\n",
      "memory usage: 108.8+ MB\n"
     ]
    }
   ],
   "source": [
    "prices = (pd.read_hdf(DATA_DIR / 'assets.h5', 'quandl/wiki/prices')\n",
    "          .loc[idx['2010':'2017', :], ['adj_close', 'adj_volume']])\n",
    "prices.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Select most traded stocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:05.384637Z",
     "start_time": "2020-06-21T23:10:04.007703Z"
    }
   },
   "outputs": [],
   "source": [
    "n_dates = len(prices.index.unique('date'))\n",
    "dollar_vol = (prices.adj_close.mul(prices.adj_volume)\n",
    "              .unstack('ticker')\n",
    "              .dropna(thresh=int(.95 * n_dates), axis=1)\n",
    "              .rank(ascending=False, axis=1)\n",
    "              .stack('ticker'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:05.729173Z",
     "start_time": "2020-06-21T23:10:05.386448Z"
    }
   },
   "outputs": [],
   "source": [
    "most_traded = dollar_vol.groupby(level='ticker').mean().nsmallest(500).index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:09.380040Z",
     "start_time": "2020-06-21T23:10:05.730667Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "DatetimeIndex: 2013 entries, 2017-12-29 to 2010-01-04\n",
      "Columns: 500 entries, A to CHTR\n",
      "dtypes: float64(500)\n",
      "memory usage: 7.7 MB\n"
     ]
    }
   ],
   "source": [
    "returns = (prices.loc[idx[:, most_traded], 'adj_close']\n",
    "           .unstack('ticker')\n",
    "           .pct_change()\n",
    "           .sort_index(ascending=False))\n",
    "returns.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stack 21-day time series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:10:09.384456Z",
     "start_time": "2020-06-21T23:10:09.381276Z"
    }
   },
   "outputs": [],
   "source": [
    "n = len(returns)\n",
    "T = 21 # days\n",
    "tcols = list(range(T))\n",
    "tickers = returns.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:12:08.043887Z",
     "start_time": "2020-06-21T23:10:09.386218Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 995499 entries, ('A', Timestamp('2010-02-04 00:00:00')) to ('ZION', Timestamp('2017-12-29 00:00:00'))\n",
      "Data columns (total 22 columns):\n",
      " #   Column  Non-Null Count   Dtype  \n",
      "---  ------  --------------   -----  \n",
      " 0   label   995499 non-null  float64\n",
      " 1   1       995499 non-null  float64\n",
      " 2   2       995499 non-null  float64\n",
      " 3   3       995499 non-null  float64\n",
      " 4   4       995499 non-null  float64\n",
      " 5   5       995499 non-null  float64\n",
      " 6   6       995499 non-null  float64\n",
      " 7   7       995499 non-null  float64\n",
      " 8   8       995499 non-null  float64\n",
      " 9   9       995499 non-null  float64\n",
      " 10  10      995499 non-null  float64\n",
      " 11  11      995499 non-null  float64\n",
      " 12  12      995499 non-null  float64\n",
      " 13  13      995499 non-null  float64\n",
      " 14  14      995499 non-null  float64\n",
      " 15  15      995499 non-null  float64\n",
      " 16  16      995499 non-null  float64\n",
      " 17  17      995499 non-null  float64\n",
      " 18  18      995499 non-null  float64\n",
      " 19  19      995499 non-null  float64\n",
      " 20  20      995499 non-null  float64\n",
      " 21  21      995499 non-null  float64\n",
      "dtypes: float64(22)\n",
      "memory usage: 210.9+ MB\n"
     ]
    }
   ],
   "source": [
    "data = pd.DataFrame()\n",
    "for i in range(n-T-1):\n",
    "    df = returns.iloc[i:i+T+1]\n",
    "    date = df.index.max()\n",
    "    data = pd.concat([data, \n",
    "                      df.reset_index(drop=True).T\n",
    "                      .assign(date=date, ticker=tickers)\n",
    "                      .set_index(['ticker', 'date'])])\n",
    "data = data.rename(columns={0: 'label'}).sort_index().dropna()\n",
    "data.loc[:, tcols[1:]] = (data.loc[:, tcols[1:]].apply(lambda x: x.clip(lower=x.quantile(.01),\n",
    "                                                  upper=x.quantile(.99))))\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:12:08.055150Z",
     "start_time": "2020-06-21T23:12:08.045482Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(995499, 22)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:12:08.233063Z",
     "start_time": "2020-06-21T23:12:08.057261Z"
    }
   },
   "outputs": [],
   "source": [
    "data.to_hdf('data.h5', 'returns_daily')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build weekly dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We load the Quandl adjusted stock price data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:12:12.958144Z",
     "start_time": "2020-06-21T23:12:08.234215Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "DatetimeIndex: 2896 entries, 2007-01-01 to 2018-03-27\n",
      "Columns: 3199 entries, A to ZUMZ\n",
      "dtypes: float64(3199)\n",
      "memory usage: 70.7 MB\n"
     ]
    }
   ],
   "source": [
    "prices = (pd.read_hdf(DATA_DIR / 'assets.h5', 'quandl/wiki/prices')\n",
    "          .adj_close\n",
    "          .unstack().loc['2007':])\n",
    "prices.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Resample to weekly frequency"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We start by generating weekly returns for close to 2,500 stocks without missing data for the 2008-17 period, as follows:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:12:13.141240Z",
     "start_time": "2020-06-21T23:12:12.960275Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "DatetimeIndex: 522 entries, 2017-12-31 to 2008-01-06\n",
      "Freq: -1W-SUN\n",
      "Columns: 2489 entries, A to ZUMZ\n",
      "dtypes: float64(2489)\n",
      "memory usage: 9.9 MB\n"
     ]
    }
   ],
   "source": [
    "returns = (prices\n",
    "           .resample('W')\n",
    "           .last()\n",
    "           .pct_change()\n",
    "           .loc['2008': '2017']\n",
    "           .dropna(axis=1)\n",
    "           .sort_index(ascending=False))\n",
    "returns.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:12:13.168744Z",
     "start_time": "2020-06-21T23:12:13.142933Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>ticker</th>\n",
       "      <th>A</th>\n",
       "      <th>AAL</th>\n",
       "      <th>AAN</th>\n",
       "      <th>AAON</th>\n",
       "      <th>AAP</th>\n",
       "      <th>AAPL</th>\n",
       "      <th>AAWW</th>\n",
       "      <th>ABAX</th>\n",
       "      <th>ABC</th>\n",
       "      <th>ABCB</th>\n",
       "      <th>...</th>\n",
       "      <th>ZEUS</th>\n",
       "      <th>ZIGO</th>\n",
       "      <th>ZINC</th>\n",
       "      <th>ZION</th>\n",
       "      <th>ZIOP</th>\n",
       "      <th>ZIXI</th>\n",
       "      <th>ZLC</th>\n",
       "      <th>ZMH</th>\n",
       "      <th>ZQK</th>\n",
       "      <th>ZUMZ</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2017-12-31</th>\n",
       "      <td>-0.005642</td>\n",
       "      <td>-0.010648</td>\n",
       "      <td>-0.010184</td>\n",
       "      <td>-0.001361</td>\n",
       "      <td>-0.008553</td>\n",
       "      <td>-0.033027</td>\n",
       "      <td>-0.024938</td>\n",
       "      <td>-0.001814</td>\n",
       "      <td>-0.006922</td>\n",
       "      <td>-0.019329</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.029797</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.009741</td>\n",
       "      <td>0.022222</td>\n",
       "      <td>-0.015730</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.029138</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2017-12-24</th>\n",
       "      <td>-0.003846</td>\n",
       "      <td>0.029965</td>\n",
       "      <td>0.090171</td>\n",
       "      <td>0.044034</td>\n",
       "      <td>-0.001490</td>\n",
       "      <td>0.006557</td>\n",
       "      <td>0.046087</td>\n",
       "      <td>0.032681</td>\n",
       "      <td>-0.007620</td>\n",
       "      <td>0.017598</td>\n",
       "      <td>...</td>\n",
       "      <td>0.032153</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.026395</td>\n",
       "      <td>-0.068966</td>\n",
       "      <td>-0.024123</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.067164</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2017-12-17</th>\n",
       "      <td>0.003413</td>\n",
       "      <td>0.000784</td>\n",
       "      <td>-0.052591</td>\n",
       "      <td>-0.014006</td>\n",
       "      <td>0.003888</td>\n",
       "      <td>0.026569</td>\n",
       "      <td>0.004367</td>\n",
       "      <td>0.008396</td>\n",
       "      <td>0.074625</td>\n",
       "      <td>0.026567</td>\n",
       "      <td>...</td>\n",
       "      <td>0.036715</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.018064</td>\n",
       "      <td>-0.018059</td>\n",
       "      <td>0.075472</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>-0.051887</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2017-12-10</th>\n",
       "      <td>-0.019071</td>\n",
       "      <td>0.041012</td>\n",
       "      <td>-0.005359</td>\n",
       "      <td>-0.017882</td>\n",
       "      <td>0.010375</td>\n",
       "      <td>-0.009822</td>\n",
       "      <td>-0.028014</td>\n",
       "      <td>-0.010386</td>\n",
       "      <td>0.020600</td>\n",
       "      <td>-0.054271</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.002410</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.016973</td>\n",
       "      <td>-0.015556</td>\n",
       "      <td>-0.055679</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.062657</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2017-12-03</th>\n",
       "      <td>-0.009660</td>\n",
       "      <td>0.009267</td>\n",
       "      <td>0.105501</td>\n",
       "      <td>0.013947</td>\n",
       "      <td>0.112630</td>\n",
       "      <td>-0.022404</td>\n",
       "      <td>0.073838</td>\n",
       "      <td>-0.028456</td>\n",
       "      <td>0.045796</td>\n",
       "      <td>0.024717</td>\n",
       "      <td>...</td>\n",
       "      <td>0.065742</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.080475</td>\n",
       "      <td>0.014656</td>\n",
       "      <td>-0.006637</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.047244</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2008-02-03</th>\n",
       "      <td>0.038265</td>\n",
       "      <td>0.252238</td>\n",
       "      <td>0.002941</td>\n",
       "      <td>0.095182</td>\n",
       "      <td>0.097833</td>\n",
       "      <td>0.028767</td>\n",
       "      <td>0.006245</td>\n",
       "      <td>-0.078058</td>\n",
       "      <td>0.036913</td>\n",
       "      <td>0.083217</td>\n",
       "      <td>...</td>\n",
       "      <td>0.137066</td>\n",
       "      <td>0.127561</td>\n",
       "      <td>0.286550</td>\n",
       "      <td>0.167722</td>\n",
       "      <td>-0.087879</td>\n",
       "      <td>0.069364</td>\n",
       "      <td>0.171949</td>\n",
       "      <td>0.193189</td>\n",
       "      <td>0.127811</td>\n",
       "      <td>0.149083</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2008-01-27</th>\n",
       "      <td>-0.013963</td>\n",
       "      <td>-0.048762</td>\n",
       "      <td>0.191310</td>\n",
       "      <td>0.071788</td>\n",
       "      <td>0.043997</td>\n",
       "      <td>-0.194286</td>\n",
       "      <td>-0.008984</td>\n",
       "      <td>-0.090807</td>\n",
       "      <td>-0.034771</td>\n",
       "      <td>0.054572</td>\n",
       "      <td>...</td>\n",
       "      <td>0.018349</td>\n",
       "      <td>-0.026292</td>\n",
       "      <td>-0.046975</td>\n",
       "      <td>0.136418</td>\n",
       "      <td>-0.003021</td>\n",
       "      <td>0.145695</td>\n",
       "      <td>0.042164</td>\n",
       "      <td>-0.014553</td>\n",
       "      <td>0.141892</td>\n",
       "      <td>0.118666</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2008-01-20</th>\n",
       "      <td>-0.065000</td>\n",
       "      <td>0.086627</td>\n",
       "      <td>-0.080541</td>\n",
       "      <td>-0.054762</td>\n",
       "      <td>-0.007176</td>\n",
       "      <td>-0.065609</td>\n",
       "      <td>0.015818</td>\n",
       "      <td>-0.019721</td>\n",
       "      <td>-0.015219</td>\n",
       "      <td>-0.044397</td>\n",
       "      <td>...</td>\n",
       "      <td>0.040573</td>\n",
       "      <td>0.010999</td>\n",
       "      <td>-0.167109</td>\n",
       "      <td>-0.051614</td>\n",
       "      <td>-0.054286</td>\n",
       "      <td>-0.124638</td>\n",
       "      <td>0.037172</td>\n",
       "      <td>-0.037312</td>\n",
       "      <td>-0.030144</td>\n",
       "      <td>-0.076969</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2008-01-13</th>\n",
       "      <td>0.035375</td>\n",
       "      <td>-0.041902</td>\n",
       "      <td>-0.037818</td>\n",
       "      <td>-0.046538</td>\n",
       "      <td>-0.101486</td>\n",
       "      <td>-0.040878</td>\n",
       "      <td>-0.052095</td>\n",
       "      <td>0.097385</td>\n",
       "      <td>0.080137</td>\n",
       "      <td>-0.017313</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.054176</td>\n",
       "      <td>-0.047993</td>\n",
       "      <td>-0.102381</td>\n",
       "      <td>0.037264</td>\n",
       "      <td>-0.022346</td>\n",
       "      <td>-0.172662</td>\n",
       "      <td>0.011799</td>\n",
       "      <td>0.051880</td>\n",
       "      <td>0.018692</td>\n",
       "      <td>-0.094249</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2008-01-06</th>\n",
       "      <td>-0.072553</td>\n",
       "      <td>-0.156356</td>\n",
       "      <td>-0.068707</td>\n",
       "      <td>-0.133301</td>\n",
       "      <td>-0.065496</td>\n",
       "      <td>-0.098984</td>\n",
       "      <td>-0.029478</td>\n",
       "      <td>-0.098374</td>\n",
       "      <td>-0.037363</td>\n",
       "      <td>-0.132733</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.027290</td>\n",
       "      <td>-0.075806</td>\n",
       "      <td>-0.004739</td>\n",
       "      <td>-0.081058</td>\n",
       "      <td>0.101538</td>\n",
       "      <td>-0.143737</td>\n",
       "      <td>-0.134100</td>\n",
       "      <td>0.000752</td>\n",
       "      <td>-0.133102</td>\n",
       "      <td>-0.269012</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 2489 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "ticker             A       AAL       AAN      AAON       AAP      AAPL  \\\n",
       "date                                                                     \n",
       "2017-12-31 -0.005642 -0.010648 -0.010184 -0.001361 -0.008553 -0.033027   \n",
       "2017-12-24 -0.003846  0.029965  0.090171  0.044034 -0.001490  0.006557   \n",
       "2017-12-17  0.003413  0.000784 -0.052591 -0.014006  0.003888  0.026569   \n",
       "2017-12-10 -0.019071  0.041012 -0.005359 -0.017882  0.010375 -0.009822   \n",
       "2017-12-03 -0.009660  0.009267  0.105501  0.013947  0.112630 -0.022404   \n",
       "2008-02-03  0.038265  0.252238  0.002941  0.095182  0.097833  0.028767   \n",
       "2008-01-27 -0.013963 -0.048762  0.191310  0.071788  0.043997 -0.194286   \n",
       "2008-01-20 -0.065000  0.086627 -0.080541 -0.054762 -0.007176 -0.065609   \n",
       "2008-01-13  0.035375 -0.041902 -0.037818 -0.046538 -0.101486 -0.040878   \n",
       "2008-01-06 -0.072553 -0.156356 -0.068707 -0.133301 -0.065496 -0.098984   \n",
       "\n",
       "ticker          AAWW      ABAX       ABC      ABCB  ...      ZEUS      ZIGO  \\\n",
       "date                                                ...                       \n",
       "2017-12-31 -0.024938 -0.001814 -0.006922 -0.019329  ... -0.029797  0.000000   \n",
       "2017-12-24  0.046087  0.032681 -0.007620  0.017598  ...  0.032153  0.000000   \n",
       "2017-12-17  0.004367  0.008396  0.074625  0.026567  ...  0.036715  0.000000   \n",
       "2017-12-10 -0.028014 -0.010386  0.020600 -0.054271  ... -0.002410  0.000000   \n",
       "2017-12-03  0.073838 -0.028456  0.045796  0.024717  ...  0.065742  0.000000   \n",
       "2008-02-03  0.006245 -0.078058  0.036913  0.083217  ...  0.137066  0.127561   \n",
       "2008-01-27 -0.008984 -0.090807 -0.034771  0.054572  ...  0.018349 -0.026292   \n",
       "2008-01-20  0.015818 -0.019721 -0.015219 -0.044397  ...  0.040573  0.010999   \n",
       "2008-01-13 -0.052095  0.097385  0.080137 -0.017313  ... -0.054176 -0.047993   \n",
       "2008-01-06 -0.029478 -0.098374 -0.037363 -0.132733  ... -0.027290 -0.075806   \n",
       "\n",
       "ticker          ZINC      ZION      ZIOP      ZIXI       ZLC       ZMH  \\\n",
       "date                                                                     \n",
       "2017-12-31  0.000000 -0.009741  0.022222 -0.015730  0.000000  0.000000   \n",
       "2017-12-24  0.000000  0.026395 -0.068966 -0.024123  0.000000  0.000000   \n",
       "2017-12-17  0.000000 -0.018064 -0.018059  0.075472  0.000000  0.000000   \n",
       "2017-12-10  0.000000  0.016973 -0.015556 -0.055679  0.000000  0.000000   \n",
       "2017-12-03  0.000000  0.080475  0.014656 -0.006637  0.000000  0.000000   \n",
       "2008-02-03  0.286550  0.167722 -0.087879  0.069364  0.171949  0.193189   \n",
       "2008-01-27 -0.046975  0.136418 -0.003021  0.145695  0.042164 -0.014553   \n",
       "2008-01-20 -0.167109 -0.051614 -0.054286 -0.124638  0.037172 -0.037312   \n",
       "2008-01-13 -0.102381  0.037264 -0.022346 -0.172662  0.011799  0.051880   \n",
       "2008-01-06 -0.004739 -0.081058  0.101538 -0.143737 -0.134100  0.000752   \n",
       "\n",
       "ticker           ZQK      ZUMZ  \n",
       "date                            \n",
       "2017-12-31  0.000000 -0.029138  \n",
       "2017-12-24  0.000000  0.067164  \n",
       "2017-12-17  0.000000 -0.051887  \n",
       "2017-12-10  0.000000  0.062657  \n",
       "2017-12-03  0.000000  0.047244  \n",
       "2008-02-03  0.127811  0.149083  \n",
       "2008-01-27  0.141892  0.118666  \n",
       "2008-01-20 -0.030144 -0.076969  \n",
       "2008-01-13  0.018692 -0.094249  \n",
       "2008-01-06 -0.133102 -0.269012  \n",
       "\n",
       "[10 rows x 2489 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "returns.head().append(returns.tail())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create & stack 52-week sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll use 52-week sequences, which we'll create in a stacked format:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:12:13.172574Z",
     "start_time": "2020-06-21T23:12:13.170195Z"
    }
   },
   "outputs": [],
   "source": [
    "n = len(returns)\n",
    "T = 52 # weeks\n",
    "tcols = list(range(T))\n",
    "tickers = returns.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:12:57.482592Z",
     "start_time": "2020-06-21T23:12:13.174386Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 1167341 entries, ('A', Timestamp('2017-12-31 00:00:00')) to ('ZUMZ', Timestamp('2009-01-11 00:00:00'))\n",
      "Data columns (total 53 columns):\n",
      " #   Column  Non-Null Count    Dtype  \n",
      "---  ------  --------------    -----  \n",
      " 0   0       1167341 non-null  float64\n",
      " 1   1       1167341 non-null  float64\n",
      " 2   2       1167341 non-null  float64\n",
      " 3   3       1167341 non-null  float64\n",
      " 4   4       1167341 non-null  float64\n",
      " 5   5       1167341 non-null  float64\n",
      " 6   6       1167341 non-null  float64\n",
      " 7   7       1167341 non-null  float64\n",
      " 8   8       1167341 non-null  float64\n",
      " 9   9       1167341 non-null  float64\n",
      " 10  10      1167341 non-null  float64\n",
      " 11  11      1167341 non-null  float64\n",
      " 12  12      1167341 non-null  float64\n",
      " 13  13      1167341 non-null  float64\n",
      " 14  14      1167341 non-null  float64\n",
      " 15  15      1167341 non-null  float64\n",
      " 16  16      1167341 non-null  float64\n",
      " 17  17      1167341 non-null  float64\n",
      " 18  18      1167341 non-null  float64\n",
      " 19  19      1167341 non-null  float64\n",
      " 20  20      1167341 non-null  float64\n",
      " 21  21      1167341 non-null  float64\n",
      " 22  22      1167341 non-null  float64\n",
      " 23  23      1167341 non-null  float64\n",
      " 24  24      1167341 non-null  float64\n",
      " 25  25      1167341 non-null  float64\n",
      " 26  26      1167341 non-null  float64\n",
      " 27  27      1167341 non-null  float64\n",
      " 28  28      1167341 non-null  float64\n",
      " 29  29      1167341 non-null  float64\n",
      " 30  30      1167341 non-null  float64\n",
      " 31  31      1167341 non-null  float64\n",
      " 32  32      1167341 non-null  float64\n",
      " 33  33      1167341 non-null  float64\n",
      " 34  34      1167341 non-null  float64\n",
      " 35  35      1167341 non-null  float64\n",
      " 36  36      1167341 non-null  float64\n",
      " 37  37      1167341 non-null  float64\n",
      " 38  38      1167341 non-null  float64\n",
      " 39  39      1167341 non-null  float64\n",
      " 40  40      1167341 non-null  float64\n",
      " 41  41      1167341 non-null  float64\n",
      " 42  42      1167341 non-null  float64\n",
      " 43  43      1167341 non-null  float64\n",
      " 44  44      1167341 non-null  float64\n",
      " 45  45      1167341 non-null  float64\n",
      " 46  46      1167341 non-null  float64\n",
      " 47  47      1167341 non-null  float64\n",
      " 48  48      1167341 non-null  float64\n",
      " 49  49      1167341 non-null  float64\n",
      " 50  50      1167341 non-null  float64\n",
      " 51  51      1167341 non-null  float64\n",
      " 52  52      1167341 non-null  float64\n",
      "dtypes: float64(53)\n",
      "memory usage: 476.5+ MB\n"
     ]
    }
   ],
   "source": [
    "data = pd.DataFrame()\n",
    "for i in range(n-T-1):\n",
    "    df = returns.iloc[i:i+T+1]\n",
    "    date = df.index.max()    \n",
    "    data = pd.concat([data, (df.reset_index(drop=True).T\n",
    "                             .assign(date=date, ticker=tickers)\n",
    "                             .set_index(['ticker', 'date']))])\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:13:06.958893Z",
     "start_time": "2020-06-21T23:12:57.483953Z"
    }
   },
   "outputs": [],
   "source": [
    "data[tcols] = (data[tcols].apply(lambda x: x.clip(lower=x.quantile(.01),\n",
    "                                                  upper=x.quantile(.99))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:13:07.140397Z",
     "start_time": "2020-06-21T23:13:06.960091Z"
    }
   },
   "outputs": [],
   "source": [
    "data = data.rename(columns={0: 'fwd_returns'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:13:07.147259Z",
     "start_time": "2020-06-21T23:13:07.141452Z"
    }
   },
   "outputs": [],
   "source": [
    "data['label'] = (data['fwd_returns'] > 0).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:13:07.154688Z",
     "start_time": "2020-06-21T23:13:07.149452Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1167341, 54)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T23:13:08.172023Z",
     "start_time": "2020-06-21T23:13:07.155929Z"
    }
   },
   "outputs": [],
   "source": [
    "data.sort_index().to_hdf('data.h5', 'returns_weekly')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:ml4t-dl]",
   "language": "python",
   "name": "conda-env-ml4t-dl-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "282.222px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
